1 Import data

library(tidyverse)
library(data.table)
library(ggplot2)
library(readxl)
library(cluster)
library(plotly)
library(fpc)

# Set seed
set.seed(12345)
# Set working directory
setwd("~/Desktop/Fall 2019 Academics/HS 256_Healthcare data analytics and Data Mining/HW4")

# inpatient discharge data, 
df_ip <- fread("VTINP16_upd.TXT")

# Revenue Code data
df_rev_code <- fread("VTREVCODE16.TXT") %>%  

# exclude the low dollar value services (less than $100) by dropping the REVCHRGS <100
# Note that with this filter 7 unique PCCR are  removed hence we only have 52 PCCRs
# in the proceeding analysis instead of the original 57
    filter(REVCHRGS >= 100) 
# DRG names: These can be found in FILE_LAYOUT_and_CODES -"MSDRG 2007 forward" tab
DRG_Names <- read_excel("DRG_PCCR_Names.xlsx", sheet = "DRG")
# PCCR names: These can be found in REVCODE_FILE_LAYOUT_and_CODES -"PCCR" tab
PCCR_Names <- read_excel("DRG_PCCR_Names.xlsx", sheet = "PCCR") %>% 
  mutate(PCCR = as.numeric(PCCR))

2 Data Preparation

  1. 1st output shows the dimensions of the cross tab: Compare that with the 681 by 55 required by Professor Rasavi
  2. 2nd output shows the first 10 observations of the cross tab.
 # Data Preparation --------------------------------------------------------
  df_ip_cluster <- df_ip %>% 
    # Filter your hospital admissions to only important DRGs between 20 and 977:
    # With this filter we lose 15 uniques DRGs
    filter(between(DRG, 20, 977)) %>% 
    # Link your filtered Inpatient data to the Revenue Code file using the UNIQ variable
    right_join(df_rev_code,  by = c("UNIQ" = "Uniq")) %>% 
    # Now sum all the charges by the PCCR category 
    group_by(UNIQ,DRG,PCCR) %>% 
    summarise(sum_charges = sum(REVCHRGS)) %>% 
    # cross tabulate your list of selected DRGs (in the row) and the mean value of the PCCRs, 
    # as cell value
    group_by(DRG, PCCR) %>% 
    summarise(avg_cost = round(mean(sum_charges),2)) %>% 
    filter(!is.na(avg_cost)) %>% 
   # Cross tab
    pivot_wider(names_from = PCCR, values_from = avg_cost) 

   # Cross tab dimensions
    print(dim(df_ip_cluster))
## [1] 703  59
   #  NAs to zero
   df_ip_cluster[is.na(df_ip_cluster)] <- 0
  
  df_ip_cluster <-  df_ip_cluster %>% 
    # select PCCR 3700 Operating Room & PCCR 4000 Anesthesiology
    mutate(Operating_Room = as.numeric(`3700`),
           Anesthesiology = as.numeric(`4000`)) %>% 
    # Create a new cost category by combining the PCCR 3700 Operating Room + PCCR 4000 Anesthesiology
    mutate(PCCR_OR_and_Anesth_Costs = Operating_Room + Anesthesiology) %>% 
    # plug in the DRG names to your cross-tabulated file
    left_join(DRG_Names, by = c("DRG" = "DRG")) %>% 
    ungroup() %>% 
    select(DRG_Name, Operating_Room, Anesthesiology,PCCR_OR_and_Anesth_Costs, everything())

  #
  head(df_ip_cluster, n=10)

3 Cluster analysis

Results with three clusters

# Cluster Analysis --------------------------------------------------------

  # k_means clustering
  df_ip_cluster_kmeans <-  df_ip_cluster %>% 
    select(PCCR_OR_and_Anesth_Costs) 
  
  # k = 2
  k_means_2 <- kmeans(df_ip_cluster_kmeans, 2)
  CHIndex_2 <- round(calinhara(df_ip_cluster_kmeans,k_means_2$cluster),3)
  wss_2 <- round(k_means_2$tot.withinss,3)
  
  # k = 3
  k_means_3 <- kmeans(df_ip_cluster_kmeans, 3)
  CHIndex_3 <- round(calinhara(df_ip_cluster_kmeans,k_means_3$cluster),3)
  wss_3 <- round(k_means_3$tot.withinss,3)
  
  # k = 4
  k_means_4 <- kmeans(df_ip_cluster_kmeans, 4)
  CHIndex_4 <- round(calinhara(df_ip_cluster_kmeans,k_means_4$cluster),3)
  wss_4 <- round(k_means_4$tot.withinss,3)
  
  # k = 5
  k_means_5 <- kmeans(df_ip_cluster_kmeans, 5)
  CHIndex_5 <- round(calinhara(df_ip_cluster_kmeans,k_means_5$cluster),3)
  wss_5 <- round(k_means_5$tot.withinss,3)
  
  # k_means summary table
  k      <- as.character(c(2,3,4,5))
  CHIndex <-  c(CHIndex_2, CHIndex_3, CHIndex_4, CHIndex_5)
  wss <- c(wss_2, wss_3, wss_4, wss_5)
  k_means_sum <- as.data.frame(cbind(k, CHIndex,wss))

  #  k_means = 3: create dataframe for cluster plot:
  df_ip_3_clusters <- df_ip_cluster_kmeans %>%
  # obtain clusters
    cbind(k_means_3$cluster) %>% 
  # obtain DRG names
    cbind(df_ip_cluster$DRG_Name) %>% 
  # renane variables
    mutate(cluster = as.character(`k_means_3$cluster`),
           DRG_Name =`df_ip_cluster$DRG_Name`) %>% 
    left_join(DRG_Names, by = c("DRG_Name" = "DRG_Name")) %>% 
  # Select only relevant columns
    select(`DRG`, DRG_Name, PCCR_OR_and_Anesth_Costs,cluster)
 
  # Note that DRG 274 is missing in the FILE_LAYOUT_AND_CODES.xlsf file although
  # it is found in the Inpatient file.We inpute the value for this DRG based on online resources
  # df_ip_3_clusters$DRG_Name[is.na(df_ip_3_clusters$DRG_Name)] <- "PERCUTANEOUS INTRACARDIAC PROCEDURES WITHOUT MCC"
  
head(df_ip_3_clusters, n=10)  

4 Cluster Plots

plot_1a <-  ggplot(df_ip_3_clusters, aes(x=DRG, y= PCCR_OR_and_Anesth_Costs, 
                              color = cluster)) + 
  geom_point(aes(text=DRG_Name)) +
  ggtitle("K-means Clustering: Inpatient DRGs") +
  xlab("DRG Code") +
  ylab("Average Cost-Operating Room and Anesthesiology ($)")+
  theme(
    legend.position = "right",
    panel.border = element_blank(),  
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "grey"))
plot_1b <- ggplotly(plot_1a)


# Plot 2 
k_means_sum_plots <- k_means_sum %>% 
mutate(k = as.numeric(as.character(k)),
       CHIndex = as.numeric(as.character(CHIndex)),
       wss = as.numeric(as.character(wss)))

plot_2 <- ggplot(data = k_means_sum_plots, aes(x=k, y = wss)) +
  geom_line(linetype = 2, colour = "blue", size=0.4 ) + 
  geom_point(shape=18, size =3, color ="blue")+
  xlab("K") +
  ylab("Total Within cluster Sum of Squares by cluster (TWSS)") +
  ggtitle("Scree plot") +
  theme(
    panel.border = element_blank(),  
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "grey"))


# Plot 3
plot_3 <- ggplot(data = k_means_sum_plots, aes(x=k, y =CHIndex )) +
  geom_line(linetype = 2, colour = "blue", size=0.4 ) + 
  geom_point(shape=18, size =3, color ="blue")+
  xlab("K") +
  ylab("Calinski-Harabasz F-statistics") +
  ggtitle("Optimal number of clusters") +
theme(
  panel.border = element_blank(),  
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  panel.background = element_blank(),
  axis.line = element_line(colour = "grey"))

plot_1a

plot_1b
plot_2

plot_3